library(timetk)
library(inspectdf)
library(janitor)
library(readr)
library(dplyr)
library(readr)
library(ggplot2)
library(naniar)
library(packcircles)
library(ggridges)
library(ggbeeswarm)
library(patchwork)
library(dplyr)
library(tidyr)
library(readr)
library(skimr)
library(purrr)
library(stringr)
library(urltools)
library(magrittr)
library(lubridate)
library(janitor)
library(tseries)

Bevezetés

Az adatsor Tetouan City 2017-es áramfogyasztásának alakulását írja le 10 percenkénti bontásban 3 zónára.

Hivatkozás

Salam, A., & El Hibaoui, A. (2018, December). Comparison of Machine Learning Algorithms for the Power Consumption Prediction:-Case Study of Tetouan city–. In 2018 6th International Renewable and Sustainable Energy Conference (IRSEC) (pp. 1-5). IEEE

Változók

  • Date Time: 2017.01.01. 00:00 órától 10 percenként 2017.12.30.24:00-ig.
  • Temperature: Hőmérséklet a városban
  • Humidity: Páratartalom
  • Wind Speed: szél sebesség
  • general diffuse flows: általános diffúz áramlás
  • diffuse flows: diffúz áramlás
  • power consumption of zone 1 of Tetouan city: 1. zóna fogyasztéás
  • power consumption of zone 2 of Tetouan city: 2. zóna fogyasztás
  • power consumption of zone 3 of Tetouan city: 3. zóna fogyasztás

Cél

az áramfogyasztás előrejelzése idősor elemzéssel

Adatok betöltése

url <-
  "https://raw.githubusercontent.com/fzksmrk/adatbanyaszati-beadando-2/main/data.csv"
data <- readr::read_csv(
  url,
  col_types = cols(
    DateTime = col_datetime(format = "%m/%d/%Y %H:%M"),
    Temperature = col_double(),
    Humidity = col_double(),
    `Wind Speed` = col_double(),
    `general diffuse flows` = col_double(),
    `diffuse flows` = col_double(),
    `Zone 1 Power Consumption` = col_double(),
    `Zone 2  Power Consumption` = col_double(),
    `Zone 3  Power Consumption` = col_double()
  )
)

data
data <- janitor::clean_names(data)
data <- dplyr::rename(data,"zone_1" = "zone_1_power_consumption","zone_2" = "zone_2_power_consumption","zone_3" = "zone_3_power_consumption")
data

Új változók létrehozása

Teljes áramfelhasználás

data <- data %>%
  mutate(data, zone_total = zone_1 + zone_2 + zone_3)

Idő bővítés

data <- data %>%
  timetk::tk_augment_timeseries_signature(date_time) %>%
  select(
    -matches(
      "(half)|(mday)|(qday)|(mday)|(yday)|(mweek)|(xts)|(second)|(minute)|(iso)|(num)|(hour12)|(am.pm)|(week\\d)|(mday7)"
    )
  ) %>%
  select(-diff,-wday) %>%
  mutate(hour = factor(hour, ordered = TRUE))

Adatok vizsgálata

inspectdf::inspect_num(data)

Általános változók

data %>% 
  ggplot(aes(date_time, temperature)) +
  geom_line() +
  labs(
    x = NULL,
    y = "Temperature"
  ) -> p1

data %>% 
  ggplot(aes(date_time, humidity)) +
  geom_line() +
  labs(
    x = NULL,
    y = "Humidity"
  ) -> p2

data %>% 
  ggplot(aes(date_time, wind_speed)) +
  geom_line() +
  labs(
    x = NULL,
    y = "Wind Speed"
  ) -> p3

data %>% 
  select(date_time, contains("flows")) %>% 
  pivot_longer(-date_time) %>% 
  ggplot(aes(date_time, value)) +
  geom_line(aes(color = name)) +
  labs(
    x = NULL,
    y = "Flows",
    color = "Flows"
  ) -> p4

p1 / p2 / p3 / p4

Zónák energia felhasználása

data %>% 
  select(date_time, contains("zone")) %>% 
  pivot_longer(-date_time) %>% 
  ggplot(aes(date_time, value)) +
  geom_line(aes(color = name)) +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(
    x = NULL,
    y = "Power",
    color = "Zone"
  )

boxplot(data$zone_1, main = "Zone 1", ylab = "Value")

boxplot(data$zone_2, main = "Zone 2", ylab = "Value")

boxplot(data$zone_3, main = "Zone 3", ylab = "Value")

boxplot(data$zone_total, main = "Zone Total", ylab = "Value")

data %>% 
  select(hour, zone_1,zone_2,zone_3) %>% 
  pivot_longer(-hour) %>% 
  ggplot(aes(hour, value)) +
  geom_boxplot(aes(color = name),
               outlier.alpha = 0.1) -> p1

data %>% 
  select(wday.lbl, zone_1,zone_2,zone_3) %>% 
  pivot_longer(-wday.lbl) %>% 
  ggplot(aes(wday.lbl, value)) +
  geom_boxplot(aes(color = name),
               outlier.alpha = 0.1) -> p2

data %>% 
  select(month.lbl, zone_1,zone_2,zone_3) %>% 
  pivot_longer(-month.lbl) %>% 
  ggplot(aes(month.lbl, value)) +
  geom_boxplot(aes(color = name),
               outlier.alpha = 0.1) -> p3

p1/ p2 / p3

Decompose

decompose_ts <- function(series, freq = 144) {
  ts_obj <- ts(series, frequency = freq)
  decompose(ts_obj)
}
plot(decompose_ts(data$zone_1))

plot(decompose_ts(data$zone_2))

plot(decompose_ts(data$zone_3))

plot(decompose_ts(data$temperature))

plot(decompose_ts(data$wind_speed))

plot(decompose_ts(data$humidity))

plot(decompose_ts(data$general_diffuse_flows))

plot(decompose_ts(data$diffuse_flows))

Prediktív elemzés

Zone 1

az áramfogyasztás előrejelzése idősor elemzéssel

Hozzuk létre az idősorunkat

zone_1_ts <- ts(data$zone_1, frequency = 144)

Elemezzük, hogy az adatunk stacionárius-e

adf_test <- adf.test(zone_1_ts)
## Warning in adf.test(zone_1_ts): p-value smaller than printed p-value
print(adf_test$p.value)
## [1] 0.01

Mivel a p érték kevesebb, mint 0.05, ezért a nullhipotézist elvethetjük és elfogadhatjuk, hogy az adatunk stacionárius

acf(zone_1_ts)

pacf(zone_1_ts)

library(forecast)
fit <- auto.arima(zone_1_ts, seasonal = TRUE)
summary(fit)
## Series: zone_1_ts 
## ARIMA(3,0,0)(0,1,0)[144] 
## 
## Coefficients:
##          ar1      ar2     ar3
##       1.0507  -0.1566  0.0816
## s.e.  0.0044   0.0063  0.0044
## 
## sigma^2 = 187691:  log likelihood = -391528.2
## AIC=783064.5   AICc=783064.5   BIC=783100
## 
## Training set error measures:
##                    ME     RMSE      MAE         MPE      MAPE      MASE
## Training set 0.162183 432.6257 264.0267 -0.01083862 0.8469455 0.2029934
##                       ACF1
## Training set -0.0007429365
forecast_values <- forecast(fit, h = 77*2*5)

lasts <- tail(zone_1_ts, 77*2*10)

# Create a data frame for the historical data
history_df <- data.frame(Time = time(lasts), Value = as.numeric(lasts), Lo95 = NA, Hi95 = NA, Type = "Historical")

# Create a data frame for the forecasted data
forecast_df <- data.frame(Time = time(forecast_values$mean), 
                          Value = as.numeric(forecast_values$mean), 
                          Lo95 = as.numeric(forecast_values$lower[,2]),
                          Hi95 = as.numeric(forecast_values$upper[,2]),
                          Type = "Forecast")

# Ensure that 'Time' columns are of the same class
history_df$Time <- as.numeric(history_df$Time)
forecast_df$Time <- as.numeric(forecast_df$Time)

# Combine the historical and forecasted data into a single data frame
combined_df <- rbind(history_df, forecast_df)

# Create a ggplot of the historical and forecasted values
ggplot() +
  geom_line(data = combined_df, aes(x = Time, y = Value, color = Type)) +
  geom_line(data = forecast_df, aes(x = Time, y = Lo95), linetype = "dotted", color = "blue") +
  geom_line(data = forecast_df, aes(x = Time, y = Hi95), linetype = "dotted", color = "blue") +
  scale_color_manual(values = c("Historical" = "black", "Forecast" = "red")) +
  labs(title = "ARIMA Forecast", x = "Time", y = "Value") +
  theme_minimal()

Total (1+2+3)

zone_ts <- ts(data$zone_total, frequency = 144)

fit_zone <- auto.arima(zone_ts, seasonal = TRUE)
summary(fit_zone)
## Series: zone_ts 
## ARIMA(4,0,0)(0,1,0)[144] 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4
##       1.0733  -0.1280  0.0500  -0.0108
## s.e.  0.0044   0.0064  0.0064   0.0044
## 
## sigma^2 = 349552:  log likelihood = -407780.2
## AIC=815570.4   AICc=815570.4   BIC=815614.7
## 
## Training set error measures:
##                     ME     RMSE      MAE          MPE      MAPE      MASE
## Training set 0.2655549 590.3941 406.7471 -0.004305024 0.5934085 0.1754785
##                      ACF1
## Training set 4.183627e-05
forecast_values <- forecast(fit_zone, h = 77*2*5)

lasts <- tail(zone_ts, 77*2*10)

# Create a data frame for the historical data
history_df <- data.frame(Time = time(lasts), Value = as.numeric(lasts), Lo95 = NA, Hi95 = NA, Type = "Historical")

# Create a data frame for the forecasted data
forecast_df <- data.frame(Time = time(forecast_values$mean), 
                          Value = as.numeric(forecast_values$mean), 
                          Lo95 = as.numeric(forecast_values$lower[,2]),
                          Hi95 = as.numeric(forecast_values$upper[,2]),
                          Type = "Forecast")

# Ensure that 'Time' columns are of the same class
history_df$Time <- as.numeric(history_df$Time)
forecast_df$Time <- as.numeric(forecast_df$Time)

# Combine the historical and forecasted data into a single data frame
combined_df <- rbind(history_df, forecast_df)

# Create a ggplot of the historical and forecasted values
ggplot() +
  geom_line(data = combined_df, aes(x = Time, y = Value, color = Type)) +
  geom_line(data = forecast_df, aes(x = Time, y = Lo95), linetype = "dotted", color = "blue") +
  geom_line(data = forecast_df, aes(x = Time, y = Hi95), linetype = "dotted", color = "blue") +
  scale_color_manual(values = c("Historical" = "black", "Forecast" = "red")) +
  labs(title = "ARIMA Forecast", x = "Time", y = "Value") +
  theme_minimal()